# -*- coding: utf-8 -*-
import re
from datetime import datetime

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
from scrapy.utils.project import get_project_settings

from SinaWeiboSpider.items import AccountInformation, WeiboItem, RelationshipsItem, CommentItem


class WeiboSpider(scrapy.Spider):
    """Crawl Sina Weibo (mobile site weibo.cn) account profile pages.

    For each seed uid the spider fetches ``/<uid>/info`` and extracts the
    profile fields into an ``AccountInformation`` item, then follows up with
    ``/u/<uid>`` to collect the weibo/follow/fan counts before yielding the
    completed item.
    """

    name = 'sina_account'
    allowed_domains = ['weibo.cn']
    base_url = 'https://weibo.cn/'  # NOTE: already ends with a slash

    def start_requests(self):
        """Yield one /info request per seed account id."""
        start_uids = [
            # '2803301701',  # People's Daily (人民日报)
            # '5442433081',  # Zhang Yafei (张亚飞)
        ]
        for uid in start_uids:
            yield Request(url=f'{self.base_url}{uid}/info', callback=self.parse)

    @staticmethod
    def _first(matches, default='无'):
        """Return the first non-empty match with NBSP (\\xa0) chars removed,
        or *default* ('无' = "none") when the field is absent."""
        if matches and matches[0]:
            return matches[0].replace('\xa0', '')
        return default

    def parse(self, response):
        """Parse an account's /info page into an AccountInformation item.

        Follows up with a request to /u/<uid> (parse_account_info2) that
        fills in the numeric counters before the item is yielded.
        """
        item = AccountInformation()
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %X')
        item['account_id'] = re.findall(r'(\d+)/info', response.url)[0]
        # Join every text node of the profile <div>s so that label/value
        # pairs appear as 'label;value;' runs we can regex over.
        text = ';'.join(response.xpath('body/div[@class="c"]//text()').extract())

        item['nick_name'] = self._first(re.findall(r'昵称;?[：:]?(.*?);', text))
        gender = re.findall(r'性别;?[：:]?(.*?);', text)
        item['gender'] = self._first(gender)
        item['brief_introduction'] = self._first(re.findall(r'简介;?[：:]?(.*?);', text))
        item['sentiment'] = self._first(re.findall(r'感情状况;?[：:]?(.*?);', text))
        item['vip_level'] = self._first(re.findall(r'会员等级;?[：:]?(.*?);', text))
        item['authentication'] = self._first(re.findall(r'认证;?[：:]?(.*?);', text))

        # Location is "province[ city]"; split on the space separator.
        place = re.findall(r'地区;?[：:]?(.*?);', text)
        if place and place[0]:
            parts = place[0].replace('\xa0', '').split(' ')
            item['province'] = parts[0]
            item['city'] = parts[1] if len(parts) > 1 else '无'
        else:
            item['province'] = '无'
            item['city'] = '无'

        # Birthday is stored verbatim (no NBSP stripping, as before).
        birthday = re.findall(r'生日;?[：:]?(.*?);', text)
        item['birthday'] = birthday[0] if birthday and birthday[0] else '无'

        # Orientation equal to the stated gender => homosexual, else
        # heterosexual.  Guard against a missing gender field (the original
        # code raised IndexError on gender[0] here).
        sex_orientation = re.findall(r'性取向;?[：:]?(.*?);', text)
        if sex_orientation and sex_orientation[0]:
            if gender and sex_orientation[0].replace('\xa0', '') == gender[0]:
                item['sex_orientation'] = '同性恋'
            else:
                item['sex_orientation'] = '异性恋'
        else:
            item['sex_orientation'] = '无'

        # Tags end at the '更多>>' ("more") link; NBSP separates tags, so it
        # becomes the comma delimiter.
        labels = re.findall(r'标签;?[：:]?(.*?)更多>>', text)
        if labels and labels[0]:
            item['labels'] = labels[0].replace('\xa0', ',').replace(';', '').strip(',')
        else:
            item['labels'] = '无'

        # base_url already ends with '/', so do NOT prepend another slash
        # (the original produced 'https://weibo.cn//u/<uid>').
        yield Request(
            f"{self.base_url}u/{item['account_id']}",
            callback=self.parse_account_info2,
            meta={'item': item},
            dont_filter=True,
            priority=1,
        )

    def parse_account_info2(self, response):
        """Scrape weibo/follow/fan counts from /u/<uid> and yield the item.

        Counters that are not found on the page are simply left unset.
        """
        text = response.text
        item = response.meta['item']
        for key, pattern in (
            ('weibo_num', r'微博\[(\d+)\]'),     # number of posts
            ('follows_num', r'关注\[(\d+)\]'),   # accounts followed
            ('fans_num', r'粉丝\[(\d+)\]'),      # followers
        ):
            found = re.findall(pattern, text)
            if found:
                item[key] = int(found[0])
        yield item


if __name__ == '__main__':
    # Run the spider directly (without `scrapy crawl`) using the project settings.
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl('sina_account')
    crawler_process.start()
